# #red wines
# Red_wine <- read.csv("wineQualityReds.csv", header=TRUE, sep = ",")
# Red_wine$Type <- 'Red'
#
# #white wines
# White_wine <- read.csv("wineQualityWhites.csv", header=TRUE, sep = ",")
# White_wine$Type <- 'White'
#
# ## consolidated
# Data <- rbind(Red_wine,White_wine)
# drops <- c("X")
# Data <- Data[ , !(names(Data) %in% drops)]
# Data
#
# ##create final DF
# # write.csv(Data,"/Users/colinobrien/Desktop/repo/stats_6021/Stats_project_group_6/Data.csv", row.names = FALSE)
# # write.csv(Data,"/Users/colinobrien/Desktop/repo/stats_6021/Stats_project_group_6/Data", row.names = FALSE)
# ## both the Data and Data csv are the same. I know people prefer one format vs the other so I made both
library(tidyverse)
# library(ROCR)
library(faraway)
library(dplyr)
library(ggplot2)
library(reshape2)
library(leaps)
# install.packages("bestglm")
library(bestglm)
# install.packages("performance")
# library(performance)
knitr::opts_chunk$set(echo = TRUE)
## Load Datasets
full_wines_final <- read.csv("Data_Final.csv", header = TRUE, stringsAsFactors=TRUE)
# Drop quality for simplicity
full_wines_binary_with_qual<-full_wines_final
full_wines_binary <- subset(full_wines_final, select = -c(quality))
## Convert to 0 and 1 for readability
full_wines_binary$cat_quality <- as.integer(full_wines_binary$cat_quality == "High")
set.seed(90210) ##for reproducibility
sample<-sample.int(nrow(full_wines_binary), floor(.80*nrow(full_wines_binary)), replace = F)
train<-full_wines_binary[sample, ] ##training data frame
rownames(train) <- c(1:5197)
test<-full_wines_binary[-sample, ] ##test data frame
## Just for a single boxplot
train_with_qual<-full_wines_binary_with_qual[sample,]
test_with_qual<-full_wines_binary_with_qual[-sample,]
train
# drops_cats <- c("Type")
# No_cat_train <- train[ , !(names(train) %in% drops_cats)]
# # No_Type
pairs(train, lower.panel = NULL)
# Convert Type to binary to 0 and 1 for correlation
train$Type <- as.integer(train$Type == "White")
test$Type <- as.integer(test$Type == "White")
cor_train <- cor(train)
cor_train
fixed.acidity volatile.acidity citric.acid
fixed.acidity 1.00000000 0.21475470 0.3281717780
volatile.acidity 0.21475470 1.00000000 -0.3756083338
citric.acid 0.32817178 -0.37560833 1.0000000000
residual.sugar -0.11435784 -0.19711519 0.1413532761
chlorides 0.30907486 0.38572698 0.0368162603
free.sulfur.dioxide -0.28545891 -0.36296634 0.1459151441
total.sulfur.dioxide -0.32829667 -0.42380894 0.2063508767
density 0.46102677 0.27340816 0.0934734894
pH -0.25053297 0.26430327 -0.3266522997
sulphates 0.31025439 0.23565770 0.0572809414
alcohol -0.09179407 -0.03401873 -0.0006471143
Type -0.48713134 -0.65599571 0.1886904967
cat_quality -0.07066799 -0.26538562 0.0778223488
residual.sugar chlorides free.sulfur.dioxide
fixed.acidity -0.11435784 0.30907486 -0.28545891
volatile.acidity -0.19711519 0.38572698 -0.36296634
citric.acid 0.14135328 0.03681626 0.14591514
residual.sugar 1.00000000 -0.13267263 0.40674512
chlorides -0.13267263 1.00000000 -0.20583891
free.sulfur.dioxide 0.40674512 -0.20583891 1.00000000
total.sulfur.dioxide 0.49459459 -0.29216595 0.71557904
density 0.54686423 0.36564000 0.01712365
pH -0.26561668 0.04191555 -0.15788076
sulphates -0.18182649 0.40162365 -0.19860411
alcohol -0.34842765 -0.25617404 -0.17708863
Type 0.35126769 -0.52157700 0.48355395
cat_quality -0.02120537 -0.18458731 0.04477089
total.sulfur.dioxide density pH
fixed.acidity -0.32829667 0.46102677 -0.25053297
volatile.acidity -0.42380894 0.27340816 0.26430327
citric.acid 0.20635088 0.09347349 -0.32665230
residual.sugar 0.49459459 0.54686423 -0.26561668
chlorides -0.29216595 0.36564000 0.04191555
free.sulfur.dioxide 0.71557904 0.01712365 -0.15788076
total.sulfur.dioxide 1.00000000 0.02373810 -0.24648617
density 0.02373810 1.00000000 0.01687243
pH -0.24648617 0.01687243 1.00000000
sulphates -0.27752789 0.27270901 0.18260830
alcohol -0.26082628 -0.67847127 0.11700658
Type 0.70618430 -0.39538819 -0.33097301
cat_quality -0.04308093 -0.26437860 0.01854094
sulphates alcohol Type cat_quality
fixed.acidity 0.31025439 -0.0917940736 -0.48713134 -0.07066799
volatile.acidity 0.23565770 -0.0340187335 -0.65599571 -0.26538562
citric.acid 0.05728094 -0.0006471143 0.18869050 0.07782235
residual.sugar -0.18182649 -0.3484276488 0.35126769 -0.02120537
chlorides 0.40162365 -0.2561740351 -0.52157700 -0.18458731
free.sulfur.dioxide -0.19860411 -0.1770886318 0.48355395 0.04477089
total.sulfur.dioxide -0.27752789 -0.2608262814 0.70618430 -0.04308093
density 0.27270901 -0.6784712690 -0.39538819 -0.26437860
pH 0.18260830 0.1170065815 -0.33097301 0.01854094
sulphates 1.00000000 -0.0136065691 -0.49215645 0.03195034
alcohol -0.01360657 1.0000000000 0.03945167 0.39668183
Type -0.49215645 0.0394516700 1.00000000 0.12361294
cat_quality 0.03195034 0.3966818275 0.12361294 1.00000000
T_F_cor <- abs(cor_train)>.7
T_F_cor
fixed.acidity volatile.acidity citric.acid
fixed.acidity TRUE FALSE FALSE
volatile.acidity FALSE TRUE FALSE
citric.acid FALSE FALSE TRUE
residual.sugar FALSE FALSE FALSE
chlorides FALSE FALSE FALSE
free.sulfur.dioxide FALSE FALSE FALSE
total.sulfur.dioxide FALSE FALSE FALSE
density FALSE FALSE FALSE
pH FALSE FALSE FALSE
sulphates FALSE FALSE FALSE
alcohol FALSE FALSE FALSE
Type FALSE FALSE FALSE
cat_quality FALSE FALSE FALSE
residual.sugar chlorides free.sulfur.dioxide
fixed.acidity FALSE FALSE FALSE
volatile.acidity FALSE FALSE FALSE
citric.acid FALSE FALSE FALSE
residual.sugar TRUE FALSE FALSE
chlorides FALSE TRUE FALSE
free.sulfur.dioxide FALSE FALSE TRUE
total.sulfur.dioxide FALSE FALSE TRUE
density FALSE FALSE FALSE
pH FALSE FALSE FALSE
sulphates FALSE FALSE FALSE
alcohol FALSE FALSE FALSE
Type FALSE FALSE FALSE
cat_quality FALSE FALSE FALSE
total.sulfur.dioxide density pH sulphates alcohol
fixed.acidity FALSE FALSE FALSE FALSE FALSE
volatile.acidity FALSE FALSE FALSE FALSE FALSE
citric.acid FALSE FALSE FALSE FALSE FALSE
residual.sugar FALSE FALSE FALSE FALSE FALSE
chlorides FALSE FALSE FALSE FALSE FALSE
free.sulfur.dioxide TRUE FALSE FALSE FALSE FALSE
total.sulfur.dioxide TRUE FALSE FALSE FALSE FALSE
density FALSE TRUE FALSE FALSE FALSE
pH FALSE FALSE TRUE FALSE FALSE
sulphates FALSE FALSE FALSE TRUE FALSE
alcohol FALSE FALSE FALSE FALSE TRUE
Type TRUE FALSE FALSE FALSE FALSE
cat_quality FALSE FALSE FALSE FALSE FALSE
Type cat_quality
fixed.acidity FALSE FALSE
volatile.acidity FALSE FALSE
citric.acid FALSE FALSE
residual.sugar FALSE FALSE
chlorides FALSE FALSE
free.sulfur.dioxide FALSE FALSE
total.sulfur.dioxide TRUE FALSE
density FALSE FALSE
pH FALSE FALSE
sulphates FALSE FALSE
alcohol FALSE FALSE
Type TRUE FALSE
cat_quality FALSE TRUE
## create melted
melted_cor_train <- melt(cor_train)
##create heat map Consolidated
ggplot(data = melted_cor_train, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'Consolidated (Both Red and White)')
## creating red and white
train_White <- filter(train, Type == 1)
train_Red <- filter(train, Type == 0)
## droping red/white columns
train_White_NoType <- subset(train_White, select = -c(Type))
train_Red_NoType <- subset(train_Red, select = -c(Type))
## creating correlations
cor_train_White_NoType <- cor(train_White_NoType)
cor_train_Red_NoType <- cor(train_Red_NoType)
## melting
melted_cor_train_white <- melt(cor_train_White_NoType)
melted_cor_train_Red <- melt(cor_train_Red_NoType)
##ploting
##create heat map White
ggplot(data = melted_cor_train_white, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'White Wine')
##create heat map Red
ggplot(data = melted_cor_train_Red, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'Red Wine')
ggplot(data = train, mapping = aes(x=Type)) +
geom_bar()
## press formula (from class)
get_press <- function(model) {
sum(((model$residuals)/ (1- (lm.influence(model)$hat)))^2)
}
## first go
full<-glm(cat_quality~., family=binomial, data=train)
summary(full)
Call:
glm(formula = cat_quality ~ ., family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4120 -0.8919 0.4303 0.8148 2.6198
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.102e+02 4.889e+01 2.253 0.0242 *
fixed.acidity 7.676e-02 5.573e-02 1.377 0.1684
volatile.acidity -4.720e+00 3.291e-01 -14.342 < 2e-16 ***
citric.acid -4.652e-01 2.875e-01 -1.618 0.1057
residual.sugar 1.187e-01 2.104e-02 5.641 1.69e-08 ***
chlorides -1.265e+00 1.195e+00 -1.059 0.2895
free.sulfur.dioxide 1.345e-02 2.878e-03 4.672 2.98e-06 ***
total.sulfur.dioxide -5.753e-03 1.168e-03 -4.924 8.48e-07 ***
density -1.211e+02 4.964e+01 -2.440 0.0147 *
pH 7.375e-01 3.316e-01 2.224 0.0261 *
sulphates 2.096e+00 2.977e-01 7.042 1.89e-12 ***
alcohol 8.500e-01 6.635e-02 12.811 < 2e-16 ***
Type -5.562e-01 2.077e-01 -2.678 0.0074 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5335.5 on 5184 degrees of freedom
AIC: 5361.5
Number of Fisher Scoring iterations: 5
## removed all insignificant
reduced_1<-glm(formula = cat_quality~volatile.acidity+residual.sugar+free.sulfur.dioxide+total.sulfur.dioxide+density+pH+sulphates+alcohol+Type, family=binomial, data=train)
summary(reduced_1)
Call:
glm(formula = cat_quality ~ volatile.acidity + residual.sugar +
free.sulfur.dioxide + total.sulfur.dioxide + density + pH +
sulphates + alcohol + Type, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.3471 -0.9020 0.4275 0.8196 2.6625
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 76.416878 31.773373 2.405 0.01617 *
volatile.acidity -4.680054 0.305603 -15.314 < 2e-16 ***
residual.sugar 0.105875 0.014724 7.191 6.44e-13 ***
free.sulfur.dioxide 0.013183 0.002863 4.604 4.15e-06 ***
total.sulfur.dioxide -0.005930 0.001162 -5.101 3.38e-07 ***
density -86.733244 31.675484 -2.738 0.00618 **
pH 0.595611 0.228953 2.601 0.00928 **
sulphates 1.930494 0.285976 6.751 1.47e-11 ***
alcohol 0.894713 0.051370 17.417 < 2e-16 ***
Type -0.523798 0.204855 -2.557 0.01056 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5341.5 on 5187 degrees of freedom
AIC: 5361.5
Number of Fisher Scoring iterations: 5
##evaluating model
Reduced1_AIC_train <- reduced_1$aic
##predicted quality for test data based on training data
preds<-predict(reduced_1,newdata=test, type="response")
reduced_1_error <- table(test$cat_quality, preds>0.5)
reduced_1_error
FALSE TRUE
0 240 236
1 121 703
evulation_summary <- data.frame(
attempt = 'reduced_1',
AIC = Reduced1_AIC_train,
PRESS = get_press(reduced_1),
'False positive' = round(reduced_1_error[3]/(reduced_1_error[1]+reduced_1_error[3]),3),
'False negative' = round(reduced_1_error[2]/(reduced_1_error[2]+reduced_1_error[4]),3),
'Error Rate' = round((reduced_1_error[2]+reduced_1_error[3])/(reduced_1_error[1]+reduced_1_error[2]+reduced_1_error[3]+reduced_1_error[4]),3)
)
evulation_summary
# install.packages("bestglm")
## Prepare data
train.for.best.logistic <- within(train, {
y <- cat_quality
})
## Reorder variables
train.for.best.logistic <-
train.for.best.logistic[, c("fixed.acidity","volatile.acidity","citric.acid","residual.sugar","total.sulfur.dioxide","density","chlorides","free.sulfur.dioxide",'pH','sulphates','alcohol','Type',"y")]
## Perform
res.best.logistic <-
bestglm(Xy = train.for.best.logistic,
family = binomial, # binomial family for logistic
IC = "AIC", # Information criteria for
method = "exhaustive")
Morgan-Tatar search since family is non-gaussian.
res.best.logistic$BestModels
summary(res.best.logistic$BestModel)
Call:
glm(formula = y ~ ., family = family, data = Xi, weights = weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4302 -0.8932 0.4282 0.8157 2.6350
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.147e+02 4.864e+01 2.358 0.0184 *
fixed.acidity 8.518e-02 5.513e-02 1.545 0.1223
volatile.acidity -4.763e+00 3.268e-01 -14.574 < 2e-16 ***
citric.acid -5.158e-01 2.834e-01 -1.820 0.0688 .
residual.sugar 1.215e-01 2.085e-02 5.829 5.57e-09 ***
total.sulfur.dioxide -5.688e-03 1.167e-03 -4.876 1.08e-06 ***
density -1.261e+02 4.936e+01 -2.554 0.0106 *
free.sulfur.dioxide 1.332e-02 2.874e-03 4.633 3.61e-06 ***
pH 8.012e-01 3.261e-01 2.457 0.0140 *
sulphates 2.027e+00 2.897e-01 6.998 2.59e-12 ***
alcohol 8.553e-01 6.613e-02 12.934 < 2e-16 ***
Type -5.290e-01 2.059e-01 -2.569 0.0102 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5336.7 on 5185 degrees of freedom
AIC: 5360.7
Number of Fisher Scoring iterations: 5
reduced_4 <- res.best.logistic$BestModel
##evaluating model
Reduced4_AIC_train <- reduced_4$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4,newdata=test, type="response")
reduced_4_error <- table(test$cat_quality, preds>0.5)
evulation_summary_4 <- data.frame(
attempt = 'reduced_4_error (all possible)',
AIC = Reduced4_AIC_train,
PRESS = get_press(reduced_4),
'False positive' = round(reduced_4_error[3]/(reduced_4_error[1]+reduced_4_error[3]),3),
'False negative' = round(reduced_4_error[2]/(reduced_4_error[2]+reduced_4_error[4]),3),
'Error Rate' = round((reduced_4_error[2]+reduced_4_error[3])/(reduced_4_error[1]+reduced_4_error[2]+reduced_4_error[3]+reduced_4_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4)
# evulation_summary
# data.frame(check_collinearity(reduced_4))
#come back and add df stuff
reduced_4_2<-glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train)
summary(reduced_4_2)
Call:
glm(formula = cat_quality ~ volatile.acidity + citric.acid +
residual.sugar + total.sulfur.dioxide + density + free.sulfur.dioxide +
pH + sulphates + alcohol + Type, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.3469 -0.8997 0.4283 0.8144 2.6767
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 58.609496 33.904591 1.729 0.0839 .
volatile.acidity -4.841743 0.323737 -14.956 < 2e-16 ***
citric.acid -0.434875 0.278458 -1.562 0.1184
residual.sugar 0.099352 0.015302 6.493 8.43e-11 ***
total.sulfur.dioxide -0.005815 0.001165 -4.993 5.94e-07 ***
density -68.496442 33.901654 -2.020 0.0433 *
free.sulfur.dioxide 0.013292 0.002871 4.630 3.66e-06 ***
pH 0.465949 0.243355 1.915 0.0555 .
sulphates 1.962718 0.287251 6.833 8.33e-12 ***
alcohol 0.918353 0.053777 17.077 < 2e-16 ***
Type -0.483884 0.206617 -2.342 0.0192 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5339.0 on 5186 degrees of freedom
AIC: 5361
Number of Fisher Scoring iterations: 5
##evaluating model
Reduced4_2_AIC_train <- reduced_4_2$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_2,newdata=test, type="response")
reduced_4_2_error <- table(test$cat_quality, preds>0.7)
#Curves
evulation_summary_4_2 <- data.frame(
attempt = 'reduced_4_2_error (post VIF adjustments)',
AIC = Reduced4_2_AIC_train,
PRESS = get_press(reduced_4_2),
'False positive' = round(reduced_4_2_error[3]/(reduced_4_2_error[1]+reduced_4_2_error[3]),3),
'False negative' = round(reduced_4_2_error[2]/(reduced_4_2_error[2]+reduced_4_2_error[4]),3),
'Error Rate' = round((reduced_4_2_error[2]+reduced_4_2_error[3])/(reduced_4_2_error[1]+reduced_4_2_error[2]+reduced_4_2_error[3]+reduced_4_2_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_2)
evulation_summary
NA
NA
summary(reduced_4)
Call:
glm(formula = y ~ ., family = family, data = Xi, weights = weights)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4302 -0.8932 0.4282 0.8157 2.6350
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.147e+02 4.864e+01 2.358 0.0184 *
fixed.acidity 8.518e-02 5.513e-02 1.545 0.1223
volatile.acidity -4.763e+00 3.268e-01 -14.574 < 2e-16 ***
citric.acid -5.158e-01 2.834e-01 -1.820 0.0688 .
residual.sugar 1.215e-01 2.085e-02 5.829 5.57e-09 ***
total.sulfur.dioxide -5.688e-03 1.167e-03 -4.876 1.08e-06 ***
density -1.261e+02 4.936e+01 -2.554 0.0106 *
free.sulfur.dioxide 1.332e-02 2.874e-03 4.633 3.61e-06 ***
pH 8.012e-01 3.261e-01 2.457 0.0140 *
sulphates 2.027e+00 2.897e-01 6.998 2.59e-12 ***
alcohol 8.553e-01 6.613e-02 12.934 < 2e-16 ***
Type -5.290e-01 2.059e-01 -2.569 0.0102 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5336.7 on 5185 degrees of freedom
AIC: 5360.7
Number of Fisher Scoring iterations: 5
p <- 12
n <- 5197
reduced_4_cook <-cooks.distance(reduced_4)
reduced_4_cook[reduced_4_cook>qf(0.5,p,n-p)]
named numeric(0)
##dffits
DFFITS<-dffits(reduced_4)
DDFFITS_influence <- DFFITS[abs(DFFITS)>2*sqrt(p/n)]
DDFFITS_influence
83 115 133 148 149 171
0.11219627 0.10544184 -0.18411002 -0.09991644 -0.12785114 -0.11374142
183 202 206 225 249 292
0.10867107 -0.16045065 -0.11840913 0.10093853 -0.09841249 0.09816267
309 317 380 455 466 572
0.10078385 0.10471012 -0.11210579 0.09920619 -0.11390124 0.10171119
722 739 747 794 806 912
-0.16019151 0.11513457 0.09736192 -0.10449640 -0.10120073 0.15167391
952 957 1010 1042 1055 1114
0.09704330 -0.16243046 0.11892944 -0.09863713 -0.11344570 -0.10497624
1139 1180 1241 1258 1270 1325
0.10300372 -0.18433803 0.10969510 0.09821539 0.10530848 -0.09678052
1347 1467 1493 1527 1528 1597
-0.10103881 0.11642941 -0.11091048 -0.11561143 -0.12352731 -0.09863589
1620 1658 1693 1776 1846 1873
-0.10722071 -0.09731307 -0.09795498 -0.10237941 0.12561590 -0.12529594
1882 1891 2002 2004 2020 2079
-0.10642853 0.10446926 0.10894528 0.20125104 0.09637786 -0.10426201
2103 2148 2203 2256 2258 2285
0.09853506 -0.27921127 -0.10129823 -0.10012511 0.10312207 0.10896369
2305 2350 2353 2361 2383 2410
0.13774801 0.09853162 -0.15221240 -0.11012001 -0.10852123 -0.12636270
2434 2461 2465 2487 2494 2497
0.12603788 -0.15438799 -0.10048416 0.10125735 -0.12151311 0.10319545
2521 2525 2527 2551 2594 2650
-0.10237941 -0.10131647 -0.10242317 -0.09669244 -0.10722071 -0.10178224
2696 2699 2772 2781 2801 2840
-0.11840913 -0.10221624 0.20008361 -0.10703304 -0.11210579 0.16042062
2852 2892 2948 2965 2978 2994
-0.10974330 0.09732574 -0.09841249 -0.09647583 -0.11806742 0.14196649
2995 2997 3018 3030 3041 3051
0.09920619 -0.13745390 0.10099437 -0.14516539 0.16695970 -0.12151311
3065 3079 3126 3155 3188 3274
-0.21978199 0.10160629 0.17165718 0.13048253 0.17504970 -0.10882370
3282 3330 3347 3367 3378 3381
0.10171119 0.09880694 -0.11538312 -0.11374142 -0.10444247 0.10177520
3496 3526 3557 3564 3571 3709
0.12512066 -0.15993715 -0.13072803 -0.13765634 -0.11317090 -0.11344570
3721 3740 3752 3809 3823 3843
-0.15438799 -0.10404207 -0.15579966 -0.10786452 0.88209659 0.10113003
3848 3895 3922 4024 4035 4065
0.11006220 -0.18174866 -0.12028121 -0.10640867 -0.11091048 0.09637786
4071 4112 4125 4131 4132 4140
-0.31948427 -0.12024521 -0.10057784 -0.10147570 0.17165718 -0.11174867
4260 4300 4401 4425 4463 4490
-0.11267738 0.10499625 -0.10880237 -0.10594437 -0.12634536 -0.09991644
4512 4547 4610 4627 4678 4687
-0.09647583 -0.17220454 0.10741187 -0.21859387 -0.10392610 0.12149435
4783 4790 4794 4829 4873 5045
0.12315874 0.10969510 -0.09878535 -0.11104591 -0.16746164 -0.10529190
5091 5178 5187
-0.10703304 0.11264144 0.10894528
DFBETAS<-dfbetas(reduced_4)
abs(DFBETAS)>2/sqrt(n)
(Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
1 FALSE FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE FALSE
3 FALSE TRUE FALSE TRUE TRUE
4 FALSE FALSE FALSE FALSE FALSE
5 FALSE FALSE FALSE FALSE FALSE
6 FALSE FALSE FALSE FALSE FALSE
7 FALSE FALSE FALSE FALSE FALSE
8 FALSE FALSE FALSE FALSE TRUE
9 FALSE FALSE FALSE FALSE FALSE
10 TRUE TRUE FALSE TRUE TRUE
11 FALSE FALSE FALSE FALSE FALSE
12 FALSE FALSE FALSE FALSE FALSE
13 FALSE FALSE FALSE FALSE FALSE
14 FALSE FALSE FALSE FALSE FALSE
15 FALSE FALSE FALSE FALSE FALSE
16 FALSE FALSE FALSE FALSE FALSE
17 FALSE FALSE FALSE FALSE FALSE
18 FALSE FALSE FALSE FALSE FALSE
19 FALSE FALSE FALSE FALSE FALSE
20 FALSE FALSE FALSE FALSE FALSE
21 FALSE FALSE FALSE TRUE FALSE
22 FALSE FALSE FALSE FALSE FALSE
23 FALSE TRUE FALSE FALSE TRUE
24 FALSE FALSE FALSE FALSE FALSE
25 FALSE TRUE TRUE FALSE FALSE
26 FALSE FALSE FALSE FALSE FALSE
27 FALSE FALSE FALSE FALSE FALSE
28 FALSE FALSE FALSE FALSE FALSE
29 FALSE FALSE FALSE FALSE FALSE
30 FALSE FALSE FALSE FALSE FALSE
31 FALSE FALSE FALSE FALSE FALSE
32 FALSE FALSE FALSE FALSE FALSE
33 FALSE FALSE FALSE FALSE FALSE
34 FALSE FALSE FALSE FALSE FALSE
35 FALSE FALSE FALSE FALSE FALSE
36 FALSE FALSE FALSE FALSE FALSE
37 FALSE FALSE FALSE FALSE FALSE
38 FALSE FALSE FALSE FALSE FALSE
39 FALSE FALSE FALSE FALSE FALSE
40 FALSE FALSE FALSE FALSE FALSE
41 FALSE FALSE TRUE FALSE FALSE
42 FALSE FALSE FALSE FALSE FALSE
43 FALSE FALSE FALSE FALSE FALSE
44 FALSE FALSE FALSE FALSE FALSE
45 FALSE FALSE FALSE FALSE FALSE
46 FALSE FALSE FALSE FALSE FALSE
47 FALSE FALSE FALSE FALSE TRUE
48 FALSE FALSE FALSE TRUE FALSE
49 FALSE FALSE FALSE FALSE FALSE
50 FALSE FALSE FALSE FALSE FALSE
51 FALSE FALSE FALSE FALSE FALSE
52 FALSE FALSE FALSE FALSE FALSE
53 FALSE FALSE FALSE FALSE FALSE
54 FALSE FALSE FALSE FALSE FALSE
55 FALSE FALSE FALSE FALSE FALSE
56 FALSE FALSE FALSE FALSE FALSE
57 FALSE FALSE FALSE FALSE FALSE
58 FALSE FALSE FALSE FALSE FALSE
59 FALSE FALSE TRUE TRUE FALSE
60 FALSE FALSE FALSE FALSE FALSE
61 FALSE FALSE FALSE FALSE FALSE
62 FALSE FALSE FALSE FALSE FALSE
63 FALSE FALSE FALSE FALSE FALSE
64 FALSE FALSE FALSE FALSE FALSE
65 TRUE TRUE FALSE FALSE TRUE
66 FALSE FALSE FALSE FALSE FALSE
67 FALSE FALSE FALSE FALSE FALSE
68 FALSE FALSE FALSE FALSE FALSE
69 FALSE FALSE FALSE FALSE FALSE
70 FALSE FALSE FALSE FALSE FALSE
71 FALSE FALSE FALSE FALSE FALSE
72 FALSE FALSE FALSE FALSE FALSE
73 FALSE FALSE FALSE FALSE FALSE
74 FALSE FALSE FALSE FALSE FALSE
75 FALSE FALSE FALSE FALSE FALSE
76 FALSE FALSE FALSE FALSE FALSE
77 FALSE FALSE FALSE FALSE FALSE
78 FALSE FALSE FALSE FALSE FALSE
79 FALSE FALSE FALSE FALSE FALSE
80 FALSE FALSE FALSE FALSE FALSE
81 FALSE TRUE FALSE FALSE FALSE
82 FALSE FALSE FALSE FALSE FALSE
83 FALSE TRUE TRUE TRUE FALSE
total.sulfur.dioxide density free.sulfur.dioxide pH sulphates
1 FALSE FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE TRUE FALSE
4 FALSE FALSE FALSE FALSE FALSE
5 FALSE FALSE FALSE FALSE FALSE
6 FALSE FALSE FALSE FALSE FALSE
7 FALSE FALSE FALSE FALSE FALSE
8 FALSE FALSE TRUE FALSE TRUE
9 FALSE FALSE FALSE FALSE FALSE
10 TRUE TRUE TRUE FALSE FALSE
11 FALSE FALSE FALSE FALSE FALSE
12 FALSE FALSE FALSE FALSE FALSE
13 FALSE FALSE FALSE FALSE FALSE
14 FALSE FALSE FALSE FALSE FALSE
15 FALSE FALSE FALSE FALSE TRUE
16 FALSE FALSE FALSE FALSE FALSE
17 FALSE FALSE FALSE FALSE TRUE
18 FALSE FALSE FALSE TRUE FALSE
19 FALSE FALSE FALSE FALSE FALSE
20 FALSE FALSE FALSE FALSE FALSE
21 FALSE FALSE FALSE FALSE FALSE
22 FALSE FALSE FALSE FALSE FALSE
23 FALSE FALSE FALSE FALSE FALSE
24 FALSE FALSE FALSE FALSE FALSE
25 FALSE FALSE FALSE FALSE FALSE
26 FALSE FALSE FALSE FALSE FALSE
27 FALSE FALSE FALSE FALSE FALSE
28 FALSE FALSE FALSE FALSE FALSE
29 FALSE FALSE FALSE FALSE FALSE
30 FALSE FALSE FALSE FALSE FALSE
31 FALSE FALSE FALSE FALSE FALSE
32 FALSE FALSE FALSE FALSE FALSE
33 FALSE FALSE FALSE FALSE FALSE
34 FALSE FALSE FALSE FALSE FALSE
35 FALSE FALSE FALSE FALSE FALSE
36 FALSE FALSE FALSE FALSE FALSE
37 FALSE FALSE FALSE FALSE FALSE
38 FALSE FALSE FALSE FALSE FALSE
39 FALSE FALSE FALSE FALSE FALSE
40 FALSE FALSE FALSE FALSE FALSE
41 FALSE FALSE FALSE FALSE TRUE
42 FALSE FALSE FALSE FALSE FALSE
43 FALSE FALSE TRUE FALSE TRUE
44 FALSE FALSE FALSE FALSE FALSE
45 FALSE FALSE FALSE FALSE FALSE
46 FALSE FALSE FALSE FALSE FALSE
47 FALSE FALSE FALSE TRUE FALSE
48 FALSE FALSE FALSE FALSE FALSE
49 FALSE FALSE FALSE FALSE FALSE
50 FALSE FALSE FALSE FALSE FALSE
51 FALSE FALSE FALSE FALSE FALSE
52 FALSE FALSE FALSE FALSE FALSE
53 FALSE FALSE FALSE FALSE FALSE
54 FALSE FALSE FALSE FALSE FALSE
55 FALSE FALSE FALSE FALSE FALSE
56 FALSE FALSE FALSE FALSE FALSE
57 FALSE FALSE FALSE FALSE FALSE
58 FALSE FALSE FALSE FALSE FALSE
59 FALSE FALSE FALSE FALSE FALSE
60 FALSE FALSE FALSE FALSE FALSE
61 FALSE FALSE FALSE TRUE FALSE
62 FALSE FALSE FALSE FALSE FALSE
63 FALSE FALSE FALSE FALSE FALSE
64 FALSE FALSE FALSE FALSE FALSE
65 FALSE TRUE FALSE TRUE FALSE
66 FALSE FALSE FALSE FALSE FALSE
67 FALSE FALSE FALSE FALSE FALSE
68 FALSE FALSE FALSE FALSE FALSE
69 FALSE FALSE FALSE FALSE FALSE
70 FALSE FALSE TRUE FALSE FALSE
71 FALSE FALSE FALSE FALSE FALSE
72 FALSE FALSE FALSE FALSE FALSE
73 FALSE FALSE FALSE FALSE FALSE
74 FALSE FALSE FALSE FALSE FALSE
75 FALSE FALSE FALSE FALSE FALSE
76 FALSE FALSE FALSE FALSE FALSE
77 FALSE FALSE FALSE FALSE FALSE
78 FALSE FALSE FALSE FALSE FALSE
79 FALSE FALSE FALSE FALSE FALSE
80 FALSE FALSE FALSE FALSE FALSE
81 FALSE FALSE FALSE TRUE FALSE
82 FALSE FALSE FALSE FALSE FALSE
83 FALSE FALSE FALSE TRUE FALSE
alcohol Type
1 FALSE FALSE
2 FALSE FALSE
3 FALSE FALSE
4 FALSE FALSE
5 FALSE FALSE
6 FALSE TRUE
7 FALSE FALSE
8 FALSE FALSE
9 FALSE FALSE
10 TRUE TRUE
11 FALSE FALSE
12 FALSE FALSE
13 FALSE FALSE
14 FALSE FALSE
15 FALSE FALSE
16 FALSE FALSE
17 FALSE FALSE
18 FALSE FALSE
19 FALSE FALSE
20 FALSE FALSE
21 FALSE FALSE
22 FALSE FALSE
23 FALSE FALSE
24 FALSE FALSE
25 FALSE FALSE
26 FALSE FALSE
27 FALSE FALSE
28 FALSE FALSE
29 FALSE FALSE
30 FALSE FALSE
31 FALSE FALSE
32 FALSE FALSE
33 FALSE FALSE
34 FALSE FALSE
35 FALSE FALSE
36 FALSE FALSE
37 FALSE FALSE
38 FALSE FALSE
39 FALSE FALSE
40 FALSE FALSE
41 FALSE TRUE
42 FALSE FALSE
43 FALSE FALSE
44 FALSE FALSE
45 FALSE FALSE
46 FALSE FALSE
47 FALSE FALSE
48 FALSE FALSE
49 FALSE FALSE
50 FALSE FALSE
51 FALSE FALSE
52 FALSE FALSE
53 FALSE FALSE
54 FALSE FALSE
55 FALSE FALSE
56 FALSE FALSE
57 FALSE FALSE
58 FALSE FALSE
59 FALSE TRUE
60 FALSE FALSE
61 FALSE FALSE
62 FALSE FALSE
63 FALSE FALSE
64 FALSE FALSE
65 TRUE FALSE
66 FALSE FALSE
67 FALSE FALSE
68 FALSE FALSE
69 FALSE FALSE
70 FALSE FALSE
71 FALSE FALSE
72 FALSE FALSE
73 FALSE FALSE
74 FALSE FALSE
75 FALSE FALSE
76 FALSE FALSE
77 FALSE FALSE
78 FALSE FALSE
79 FALSE FALSE
80 FALSE FALSE
81 FALSE FALSE
82 FALSE FALSE
83 FALSE TRUE
[ reached getOption("max.print") -- omitted 5114 rows ]
##leverages
lev<-lm.influence(reduced_4)$hat
##identify high leverage points
leverages <- lev[lev>2*p/n]
leverages
25 48 53 65 133 145
0.004849640 0.007980011 0.007028687 0.005763404 0.022975591 0.005173793
171 175 187 202 225 228
0.005219474 0.004875721 0.006161515 0.005447858 0.006756723 0.005384983
248 249 255 282 299 309
0.008357256 0.004801962 0.005867337 0.005674515 0.005484537 0.010250038
332 333 339 345 361 380
0.004754212 0.004686940 0.004990699 0.005924282 0.006389325 0.006435227
383 417 427 455 466 489
0.006875256 0.004943798 0.004867913 0.009165489 0.006894948 0.007172970
516 557 600 607 708 722
0.010652503 0.005554653 0.005521417 0.005900775 0.004918820 0.005283197
730 747 749 765 770 806
0.005147385 0.005767084 0.005739296 0.005627857 0.004853531 0.005540805
828 873 899 912 945 957
0.007828147 0.006995196 0.005410461 0.009841709 0.005463074 0.011974020
965 995 1010 1037 1055 1114
0.005623028 0.014123202 0.004921736 0.006212390 0.006141910 0.005265677
1132 1139 1140 1152 1169 1176
0.006966068 0.005959266 0.006430522 0.006059376 0.004777701 0.005844926
1180 1188 1241 1255 1265 1300
0.012489770 0.004953075 0.004856461 0.004777701 0.005672314 0.006081786
1304 1306 1325 1345 1375 1427
0.005100154 0.005735362 0.005574241 0.004711974 0.006383030 0.004930082
1467 1479 1481 1482 1493 1495
0.006002690 0.006720302 0.005242150 0.014123202 0.007969759 0.004698112
1507 1518 1527 1528 1612 1614
0.004854829 0.005124711 0.005382525 0.005299431 0.009822894 0.008993399
1638 1642 1649 1667 1711 1719
0.007179770 0.007952008 0.005823824 0.007088896 0.004814463 0.004889259
1768 1801 1829 1845 1846 1873
0.004851478 0.004848096 0.004766974 0.006026022 0.005410441 0.005408285
1882 1887 1891 1894 1929 1933
0.004657424 0.005524176 0.005158734 0.004960904 0.007033838 0.006340479
1935 1941 1956 1979 2002 2004
0.005367322 0.005155488 0.004781888 0.006023785 0.005565063 0.013293498
2019 2097 2106 2120 2135 2137
0.005674515 0.004945639 0.005391888 0.005466349 0.006032961 0.005458302
2144 2148 2155 2165 2177 2203
0.005593131 0.020382883 0.005008204 0.005477897 0.005419422 0.004636068
2211 2218 2232 2270 2281 2294
0.004669295 0.019421769 0.006338349 0.007220018 0.006915245 0.006786840
2305 2306 2317 2350 2353 2361
0.010603689 0.005150988 0.011839638 0.004828941 0.012864370 0.005000134
2371 2379 2383 2391 2396 2400
0.005408758 0.005477511 0.006524792 0.005514256 0.008839432 0.007088896
2410 2420 2434 2461 2469 2494
0.007468644 0.004680633 0.020272029 0.016139547 0.005146048 0.008984307
2497 2525 2602 2623 2650 2663
0.006080078 0.006378263 0.007252053 0.006726745 0.005745347 0.006470764
2680 2688 2699 2772 2777 2779
0.007158112 0.005672314 0.007016065 0.013464526 0.012846622 0.006384951
2798 2801 2812 2840 2849 2852
0.004748024 0.006435227 0.007487716 0.008115731 0.006556965 0.004882351
2856 2900 2910 2912 2946 2948
0.006384951 0.005156491 0.006091693 0.004787344 0.005054188 0.004801962
2965 2969 2971 2990 2994 2995
0.005054630 0.005674515 0.004891501 0.005016018 0.005494005 0.009165489
3004 3008 3012 3018 3021 3030
0.004821646 0.005616506 0.005184523 0.007585533 0.007383966 0.008916163
3041 3046 3051 3052 3065 3074
0.016455429 0.005893423 0.008984307 0.005310204 0.012183875 0.004946746
3126 3130 3155 3188 3221 3274
0.019492710 0.005384397 0.007093095 0.015836977 0.005940855 0.005995139
3292 3309 3324 3332 3347 3367
0.004646160 0.005018396 0.005633797 0.008055287 0.007555507 0.005219474
3378 3381 3413 3418 3419 3446
0.005844088 0.007215437 0.007952008 0.004995318 0.005468012 0.004726109
3469 3474 3487 3496 3519 3524
0.005565027 0.004674916 0.005068929 0.009369520 0.005543214 0.004635661
3526 3545 3549 3557 3587 3611
0.013422322 0.004757291 0.004918820 0.007684917 0.004872856 0.004872856
3622 3650 3666 3709 3710 3720
0.004733307 0.005857163 0.006405776 0.006141910 0.005615658 0.006389325
3721 3727 3741 3752 3763 3769
0.016139547 0.006618658 0.004870547 0.016330934 0.005287558 0.004728145
3787 3792 3807 3818 3820 3823
0.004943798 0.007689718 0.005998108 0.010775791 0.007366569 0.323058874
3824 3835 3843 3848 3854 3863
0.012258040 0.004967080 0.005005760 0.006333501 0.007689718 0.008161154
3890 3895 3912 3917 3922 3953
0.005616506 0.019354101 0.004912963 0.007690767 0.010025916 0.019421769
3991 4024 4035 4071 4088 4112
0.004912256 0.005151978 0.007969759 0.033246417 0.004918863 0.007559978
4125 4129 4131 4132 4140 4143
0.005498808 0.004805200 0.007131721 0.019492710 0.009613280 0.004686940
4174 4180 4191 4195 4260 4287
0.006136951 0.005524176 0.005514261 0.005146048 0.009759507 0.004801486
4300 4301 4343 4362 4401 4463
0.005225384 0.005310204 0.007165155 0.005507166 0.005566348 0.017712588
4482 4493 4511 4512 4547 4553
0.008767404 0.007154999 0.006995196 0.005054630 0.009347506 0.010326676
4558 4585 4627 4633 4639 4687
0.006875101 0.004851478 0.014885235 0.005546699 0.004777701 0.006998472
4693 4696 4710 4715 4754 4759
0.004640983 0.005081857 0.005440630 0.005196371 0.006386644 0.007418423
4790 4829 4873 4902 4908 4972
0.004856461 0.006130203 0.012769233 0.005893423 0.008892413 0.004912256
4992 5040 5041 5054 5087 5107
0.006275606 0.004889259 0.007383966 0.006838011 0.004875721 0.006541276
5154 5178 5187
0.007677790 0.006034090 0.005565063
reduced_4.res <- reduced_4$residuals
crit<-qt(1-0.05/(2*n), n-p-1)
outliers <- reduced_4.res[abs(reduced_4.res)>crit]
outliers
8 12 21 30 83 85
-5.913799 -112.810937 -5.131140 -28.388375 7.064053 -4.902421
86 101 115 122 142 148
5.268800 -6.738462 5.835978 -11.281066 -4.900743 -5.950818
149 154 166 183 202 206
-8.284466 -4.614577 5.536043 18.759111 -11.064846 -6.976696
218 245 264 281 288 372
7.906909 -7.166119 6.562536 4.514517 -9.756502 -9.170360
373 440 505 509 525 548
-4.811386 -4.829656 10.431685 -5.199469 32.192103 -42.498339
566 589 629 655 657 659
10.705696 -4.616565 -5.057653 4.903313 -10.049213 -4.454203
675 687 701 706 717 721
-5.471326 -6.529182 -9.150344 -8.992792 -7.819105 -8.206050
722 725 739 751 768 774
-11.839559 -6.756563 4.469453 9.211716 -5.691134 -4.977692
788 794 795 817 819 833
5.293240 -5.395176 -9.673765 -13.711257 -4.723057 -10.325351
840 844 875 888 902 903
-4.977692 7.896471 -13.765446 4.511825 -4.486556 4.815690
917 954 964 967 991 1039
-9.997107 11.645455 -5.057653 7.233862 -4.851860 -29.863426
1077 1081 1107 1124 1133 1156
-6.559467 -5.979445 -11.538933 -9.743607 -8.597773 -358.992053
1162 1167 1171 1172 1226 1235
-5.913799 -13.370172 -5.991466 -13.904782 -24.686598 -22.516690
1257 1258 1263 1270 1305 1318
9.644487 4.622980 6.011786 8.044160 -5.145541 -5.100743
1344 1356 1387 1412 1428 1436
5.658642 -6.254700 -4.851860 -9.750690 -11.498443 -8.316040
1487 1497 1510 1524 1610 1620
7.687631 5.004046 -7.060659 4.482981 -16.343618 -5.238351
1652 1658 1665 1669 1693 1706
-11.418067 -7.852930 -5.163340 -16.484320 -4.689106 -4.877840
1776 1787 1790 1840 1930 1951
-16.318931 -5.613721 9.189242 -4.798258 -6.274087 -7.515991
1958 1998 2004 2006 2018 2040
-6.682392 9.251057 4.599440 5.240035 7.500553 4.734365
2041 2059 2079 2115 2119 2148
-5.264313 -15.336442 -6.043512 -23.388238 -4.859254 -6.605338
2156 2167 2195 2205 2220 2256
-5.287846 -9.600193 -5.857549 -4.579628 4.948918 -9.053372
2258 2262 2284 2290 2313 2329
17.356606 -7.264594 -16.484320 5.684278 -4.851860 -7.041619
2334 2335 2337 2386 2439 2452
-7.417245 -4.555771 -5.816303 -12.954904 9.264406 -8.282663
2480 2485 2506 2521 2536 2579
-4.793832 -18.780311 4.541019 -16.318931 -4.884719 9.211716
2594 2628 2629 2642 2660 2661
-5.238351 -6.381604 4.489202 -5.096416 4.453651 -5.461202
2666 2696 2702 2718 2727 2735
-6.254700 -6.976696 7.233862 -6.726264 5.198906 -6.197960
2745 2763 2772 2781 2793 2795
-6.606402 -11.892705 4.431016 -8.362063 -4.697738 -4.530220
2802 2840 2866 2892 2898 2955
-7.010108 4.976958 -7.349414 11.007314 -4.528817 -14.101843
2978 2994 2997 3005 3025 3040
-13.426497 6.463923 -14.095802 5.659028 -4.737794 -8.881971
3042 3065 3072 3079 3090 3105
-6.738462 -7.313679 -6.423514 16.073667 5.085934 -5.548543
3133 3149 3196 3210 3243 3248
-5.466040 -5.426271 -6.355608 -7.747739 -6.033627 5.580542
3257 3264 3322 3336 3343 3351
-4.595999 -5.857549 -9.308640 9.264406 -4.501913 5.867902
3362 3369 3400 3410 3414 3424
-13.219174 -12.293034 -4.896657 -5.318501 -9.993826 -6.001601
3434 3461 3522 3564 3566 3571
-10.863890 -6.353654 -8.866347 -8.604661 -6.119343 -6.889088
3582 3605 3629 3638 3711 3716
-5.322449 -9.260519 -8.348111 -6.028213 -6.506898 5.779253
3740 3750 3784 3816 3841 3853
-7.586267 5.060560 -4.746179 -6.432015 -6.569981 5.240035
3855 3866 3903 3910 3913 3915
-10.826648 -4.935304 5.739331 -9.056519 -4.882192 -16.323382
3942 3947 3951 4003 4019 4090
-5.403135 -4.485039 -6.092031 -8.881971 -4.877840 -4.542401
4093 4111 4116 4127 4144 4145
-5.913799 -15.362967 -5.909070 -7.854482 4.541019 -8.716350
4151 4156 4159 4170 4233 4243
-9.593344 -7.055209 -42.578664 -4.837952 -4.614577 -5.850980
4248 4306 4311 4329 4354 4356
-14.114484 -5.627580 4.678171 -11.261020 -14.672621 -5.417642
4460 4490 4503 4505 4527 4544
6.046933 -5.950818 -6.716718 4.453651 4.477563 -8.992792
4547 4556 4580 4610 4614 4627
-4.960846 -4.813642 -13.243536 5.353589 -5.833289 -4.965536
4650 4656 4659 4677 4701 4713
-4.896657 4.644296 -6.288973 5.012168 -10.498461 8.507914
4729 4783 4794 4815 4858 4880
-6.726264 7.580155 -5.470226 -5.334876 -5.096416 -6.738462
4887 4910 4924 4939 4988 5045
5.612130 -6.821823 -4.543092 4.488823 -6.821823 -11.196377
5049 5051 5091 5095 5133 5140
-4.972743 -6.197392 -8.362063 -7.809710 -5.399943 4.815690
5179 5195
-8.541941 -10.162346
## outliers removed
outliers_index <- attr(outliers, "names")
outliers_index <- as.numeric(outliers_index)
train_no_outliers <- train[-(outliers_index),]
#leverages removed
lererages_index <- attr(leverages, "names")
lererages_index <- as.numeric(lererages_index)
train_no_leverages <- train[-(lererages_index),]
# DDFFITS_influence
DDFFITS_index <- attr(DDFFITS_influence, "names")
DDFFITS_index <- as.numeric(DDFFITS_index)
train_no_DDFFITS <- train[-(DDFFITS_index),]
# all "non-normal" removed
all_special <- c(DDFFITS_index,lererages_index,outliers_index)
train_nothing_special <- train[-(all_special),]
train_nothing_special
NA
NA
NA
NA
vif(train[c(2,3,4,7,8,6,9,10,11)])
volatile.acidity citric.acid residual.sugar
1.818602 1.505238 3.353752
total.sulfur.dioxide density free.sulfur.dioxide
2.850601 5.664883 2.099196
pH sulphates alcohol
1.338240 1.427424 2.890129
train_temp<-train
# as.factor(train_temp$Type)<-numeric(train_temp$Type)
#train_temp
# as.factor
train_temp$Type <- as.numeric(train_temp$Type)-1
train_temp$Type <- as.integer(train_temp$Type)
train_temp
reduced_4_3 <- glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train_no_outliers)
reduced_4_4_lev <- glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train_no_leverages)
reduced_4_5_DDFFITS <- glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train_no_DDFFITS)
reduced_4_6_no_special <- glm(cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type, family=binomial, data=train_nothing_special)
# summary(reduced_4_6_no_special)
## checking colinearity / VIF scores
# reduced_4_3_col <- data.frame('reduced_4_3' = check_collinearity(reduced_4_3))
# reduced_4_3_col_VIF <- reduced_4_3_col[c('reduced_4_3.Term','reduced_4_3.VIF')]
# reduced_4_3_col_VIF
#
# reduced_4_4_lev_col <- data.frame('reduced_4_4_lev' = check_collinearity(reduced_4_4_lev))
# reduced_4_4_lev_col_VIF <- reduced_4_4_lev_col[c('reduced_4_4_lev.Term','reduced_4_4_lev.VIF')]
# reduced_4_4_lev_col_VIF
#
#
# reduced_4_5_DDFFITS_col <- data.frame('reduced_4_5_DDFFITS' = check_collinearity(reduced_4_5_DDFFITS))
# reduced_4_5_DDFFITS_col_VIF <- reduced_4_5_DDFFITS_col[c('reduced_4_5_DDFFITS.Term','reduced_4_5_DDFFITS.VIF')]
# reduced_4_5_DDFFITS_col_VIF
#
# reduced_4_6_no_special_col <- data.frame('reduced_4_6_no_special' = check_collinearity(reduced_4_6_no_special))
# reduced_4_6_no_special_col_VIF <- reduced_4_6_no_special_col[c('reduced_4_6_no_special.Term','reduced_4_6_no_special.VIF')]
# reduced_4_6_no_special_col_VIF
#
# VIF_summary <- data.frame('0'=reduced_4_3_col_VIF['reduced_4_3.Term'],
# '1'=reduced_4_3_col_VIF['reduced_4_3.VIF'],
# '2'=reduced_4_4_lev_col_VIF['reduced_4_4_lev.VIF'],
# '3'=reduced_4_5_DDFFITS_col_VIF['reduced_4_5_DDFFITS.VIF'],
# '4'=reduced_4_6_no_special_col_VIF['reduced_4_6_no_special.VIF'])
# colnames(VIF_summary) <- c('Predictor Variable','4_3.VIF.Outliers','4_4_lev.VIF','4_5_DDFFITS.VIF','4_6_no_special.VIF')
# VIF_summary
## VIF for Outliers
### cat_quality~volatile.acidity+citric.acid+residual.sugar+total.sulfur.dioxide+density+free.sulfur.dioxide+pH+sulphates+alcohol+Type
#
reg_4_VIF_test <- vif(train_temp[c(1,2,3,4,7,8,6,9,10,11,12)])
reg_4_2_VIF_test <- vif(train_temp[c(2,3,4,7,8,6,9,10,11,12)])
outliers_VIF <- vif(train_no_outliers[c(2,3,4,7,8,6,9,10,11,12)])
leverage_VIF <- vif(train_no_leverages[c(2,3,4,7,8,6,9,10,11,12)])
DDFFITS_VIF <- vif(train_no_DDFFITS[c(2,3,4,7,8,6,9,10,11,12)])
nothing_special <- vif(train_nothing_special[c(2,3,4,7,8,6,9,10,11,12)])
reg_4_VIF_test
fixed.acidity volatile.acidity citric.acid residual.sugar total.sulfur.dioxide
4.962487 2.139008 1.585795 9.280982 3.974170
density free.sulfur.dioxide pH sulphates alcohol
21.338984 2.189173 2.483275 1.514014 5.371286
Type
7.045601
VIF_summary_test <- data.frame('best_possible_VIF (post)'=reg_4_2_VIF_test,
'outliers_VIF'=outliers_VIF,
'leverage_VIF'=leverage_VIF,
'DDFFITS_VIF'= DDFFITS_VIF,
'nothing_special'=nothing_special)
VIF_summary_test
NA
##evaluating model
Reduced4_3_AIC_train <- reduced_4_3$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_3,newdata=test, type="response")
reduced_4_3_error <- table(test$cat_quality, preds>0.6)
evulation_summary_4_3 <- data.frame(
attempt = 'reduced_4_3_error_outliers',
AIC = Reduced4_3_AIC_train,
PRESS = get_press(reduced_4_3),
'False positive' = round(reduced_4_3_error[3]/(reduced_4_3_error[1]+reduced_4_3_error[3]),3),
'False negative' = round(reduced_4_3_error[2]/(reduced_4_3_error[2]+reduced_4_3_error[4]),3),
'Error Rate' = round((reduced_4_3_error[2]+reduced_4_3_error[3])/(reduced_4_3_error[1]+reduced_4_3_error[2]+reduced_4_3_error[3]+reduced_4_3_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_3)
evulation_summary
##evaluating model leverage
reduced_4_4_lev_AIC_train <- reduced_4_4_lev$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_4_lev,newdata=test, type="response")
reduced_4_4_lev_error <- table(test$cat_quality, preds>0.65)
evulation_summary_4_4_lev <- data.frame(
attempt = 'reduced_4_4_lev_error',
AIC = reduced_4_4_lev_AIC_train,
PRESS = get_press(reduced_4_4_lev),
'False positive' = round(reduced_4_4_lev_error[3]/(reduced_4_4_lev_error[1]+reduced_4_4_lev_error[3]),3),
'False negative' = round(reduced_4_4_lev_error[2]/(reduced_4_4_lev_error[2]+reduced_4_4_lev_error[4]),3),
'Error Rate' = round((reduced_4_4_lev_error[2]+reduced_4_4_lev_error[3])/(reduced_4_4_lev_error[1]+reduced_4_4_lev_error[2]+reduced_4_4_lev_error[3]+reduced_4_4_lev_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_4_lev)
evulation_summary
##evaluating model DDFFITS
reduced_4_5_DDFFITS_AIC_train <- reduced_4_5_DDFFITS$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_5_DDFFITS,newdata=test, type="response")
reduced_4_5_DDFFITS_error <- table(test$cat_quality, preds>0.7)
evulation_summary_4_5_DDFFITS <- data.frame(
attempt = 'reduced_4_5_DDFFITS_error',
AIC = reduced_4_5_DDFFITS_AIC_train,
PRESS = get_press(reduced_4_5_DDFFITS),
'False positive' = round(reduced_4_5_DDFFITS_error[3]/(reduced_4_5_DDFFITS_error[1]+reduced_4_5_DDFFITS_error[3]),3),
'False negative' = round(reduced_4_5_DDFFITS_error[2]/(reduced_4_5_DDFFITS_error[2]+reduced_4_5_DDFFITS_error[4]),3),
'Error Rate' = round((reduced_4_5_DDFFITS_error[2]+reduced_4_5_DDFFITS_error[3])/(reduced_4_5_DDFFITS_error[1]+reduced_4_5_DDFFITS_error[2]+reduced_4_5_DDFFITS_error[3]+reduced_4_5_DDFFITS_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_5_DDFFITS)
evulation_summary
##evaluating model DDFFITS
reduced_4_6_no_special_AIC_train <- reduced_4_6_no_special$aic
##predicted quality for test data based on training data
preds<-predict(reduced_4_6_no_special,newdata=test, type="response")
reduced_4_6_no_special_error <- table(test$cat_quality, preds>0.8)
evulation_summary_4_6_no_special <- data.frame(
attempt = 'reduced_4_6_no_special_error',
AIC = reduced_4_6_no_special_AIC_train,
PRESS = get_press(reduced_4_6_no_special),
'False positive' = round(reduced_4_6_no_special_error[3]/(reduced_4_6_no_special_error[1]+reduced_4_6_no_special_error[3]),3),
'False negative' = round(reduced_4_6_no_special_error[2]/(reduced_4_6_no_special_error[2]+reduced_4_6_no_special_error[4]),3),
'Error Rate' = round((reduced_4_6_no_special_error[2]+reduced_4_6_no_special_error[3])/(reduced_4_6_no_special_error[1]+reduced_4_6_no_special_error[2]+reduced_4_6_no_special_error[3]+reduced_4_6_no_special_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_4_6_no_special)
evulation_summary
## reduced_1
# detach(package:performance, unload=TRUE)
## FYI the performance package causes ROC curves to not work
library(ROCR)
# reduced_1
preds<-predict(reduced_1,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_1")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_1_auc <- auc@y.values
## reduced_4
preds<-predict(reduced_4,newdata=test, type="response")
rates4<-prediction(preds, test$cat_quality)
roc_result<-performance(rates4,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4")
lines(x = c(0,1), y = c(0,1), col="red")
auc4<-performance(rates4, measure = "auc")
reduced_4_auc <- auc4@y.values
## reduced_4_2
preds<-predict(reduced_4_2,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_2")
lines(x = c(0,1), y = c(0,1), col="red")
auc4_2<-performance(rates, measure = "auc")
reduced_4_2_auc <- auc4_2@y.values
## reduced_4_3
preds<-predict(reduced_4_3,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_3")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_3_auc <- auc@y.values
## reduced_4_4_lev
preds<-predict(reduced_4_4_lev,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_4_lev")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_4_lev_auc <- auc@y.values
## reduced_4_5_DDFFITS
preds<-predict(reduced_4_5_DDFFITS,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_5_DDFFITS")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_5_DDFFITS_auc <- auc@y.values
## reduced_4_6_no_special
preds<-predict(reduced_4_6_no_special,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_6_no_special")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_6_no_special_auc <- auc@y.values
AUC_summary <- data.frame('reduced_1'=reduced_1_auc,
'reduced_4'=reduced_4_auc,
'reduced_4_2'=reduced_4_2_auc,
'reduced_4_3'=reduced_4_3_auc,
'reduced_4_4_lev'=reduced_4_4_lev_auc,
'reduced_4_5_DDFFITS'=reduced_4_5_DDFFITS_auc,
'reduced_4_6_no_special'=reduced_4_6_no_special_auc)
colnames(AUC_summary) <- c('reduced_1','reduced_4','reduced_4_2','reduced_4_3','reduced_4_4_lev','reduced_4_5_DDFFITS','reduced_4_6_no_special')
AUC_summary
regfull_Red<-glm(cat_quality~., family="binomial", data=train_Red_NoType)
regnull_Red<-glm(cat_quality~1, family="binomial", data=train_Red_NoType)
step(regnull_Red, scope=list(lower=regnull_Red, upper=regfull_Red), direction="forward")
Start: AIC=1811.89
cat_quality ~ 1
Df Deviance AIC
+ alcohol 1 1540.6 1544.6
+ volatile.acidity 1 1680.6 1684.6
+ total.sulfur.dioxide 1 1726.9 1730.9
+ sulphates 1 1741.1 1745.1
+ citric.acid 1 1778.7 1782.7
+ density 1 1780.0 1784.0
+ chlorides 1 1795.4 1799.4
+ fixed.acidity 1 1797.8 1801.8
+ free.sulfur.dioxide 1 1803.8 1807.8
<none> 1809.9 1811.9
+ pH 1 1809.8 1813.8
+ residual.sugar 1 1809.9 1813.9
Step: AIC=1544.61
cat_quality ~ alcohol
Df Deviance AIC
+ volatile.acidity 1 1450.7 1456.7
+ sulphates 1 1492.7 1498.7
+ total.sulfur.dioxide 1 1507.4 1513.4
+ fixed.acidity 1 1521.6 1527.6
+ citric.acid 1 1523.5 1529.5
+ pH 1 1528.7 1534.7
+ density 1 1535.8 1541.8
<none> 1540.6 1544.6
+ free.sulfur.dioxide 1 1539.2 1545.2
+ residual.sugar 1 1539.8 1545.8
+ chlorides 1 1540.5 1546.5
Step: AIC=1456.66
cat_quality ~ alcohol + volatile.acidity
Df Deviance AIC
+ total.sulfur.dioxide 1 1417.1 1425.1
+ sulphates 1 1427.5 1435.5
+ fixed.acidity 1 1448.1 1456.1
+ free.sulfur.dioxide 1 1448.1 1456.1
<none> 1450.7 1456.7
+ citric.acid 1 1449.0 1457.0
+ density 1 1449.5 1457.5
+ residual.sugar 1 1449.8 1457.8
+ pH 1 1449.9 1457.9
+ chlorides 1 1450.7 1458.7
Step: AIC=1425.14
cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide
Df Deviance AIC
+ sulphates 1 1387.4 1397.4
+ free.sulfur.dioxide 1 1409.8 1419.8
<none> 1417.1 1425.1
+ pH 1 1415.8 1425.8
+ density 1 1416.1 1426.1
+ fixed.acidity 1 1416.2 1426.2
+ citric.acid 1 1416.8 1426.8
+ residual.sugar 1 1417.0 1427.0
+ chlorides 1 1417.1 1427.1
Step: AIC=1397.37
cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates
Df Deviance AIC
+ free.sulfur.dioxide 1 1379.6 1391.6
+ chlorides 1 1380.0 1392.0
+ citric.acid 1 1383.9 1395.9
<none> 1387.4 1397.4
+ residual.sugar 1 1387.1 1399.1
+ fixed.acidity 1 1387.3 1399.3
+ pH 1 1387.4 1399.4
+ density 1 1387.4 1399.4
Step: AIC=1391.64
cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide
Df Deviance AIC
+ chlorides 1 1372.6 1386.6
<none> 1379.6 1391.6
+ citric.acid 1 1377.9 1391.9
+ pH 1 1379.3 1393.3
+ fixed.acidity 1 1379.3 1393.3
+ residual.sugar 1 1379.5 1393.5
+ density 1 1379.6 1393.6
Step: AIC=1386.63
cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides
Df Deviance AIC
<none> 1372.6 1386.6
+ pH 1 1371.2 1387.2
+ citric.acid 1 1372.1 1388.1
+ fixed.acidity 1 1372.2 1388.2
+ residual.sugar 1 1372.2 1388.2
+ density 1 1372.6 1388.6
Call: glm(formula = cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides, family = "binomial",
data = train_Red_NoType)
Coefficients:
(Intercept) alcohol volatile.acidity
-8.02155 0.85512 -2.93115
total.sulfur.dioxide sulphates free.sulfur.dioxide
-0.01843 2.63739 0.02326
chlorides
-4.10141
Degrees of Freedom: 1308 Total (i.e. Null); 1302 Residual
Null Deviance: 1810
Residual Deviance: 1373 AIC: 1387
The model looks great after the foward selection! Time to test and add to the evaluation summary.
model1_Red<-glm(formula = cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides, family = "binomial",
data = train_Red_NoType)
summary(model1_Red)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + total.sulfur.dioxide +
sulphates + free.sulfur.dioxide + chlorides, family = "binomial",
data = train_Red_NoType)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0655 -0.8628 0.3239 0.8474 2.3082
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.021553 0.886912 -9.044 < 2e-16 ***
alcohol 0.855123 0.078409 10.906 < 2e-16 ***
volatile.acidity -2.931148 0.413395 -7.090 1.34e-12 ***
total.sulfur.dioxide -0.018428 0.002926 -6.297 3.03e-10 ***
sulphates 2.637391 0.456973 5.771 7.86e-09 ***
free.sulfur.dioxide 0.023257 0.008608 2.702 0.00689 **
chlorides -4.101411 1.588464 -2.582 0.00982 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1809.9 on 1308 degrees of freedom
Residual deviance: 1372.6 on 1302 degrees of freedom
AIC: 1386.6
Number of Fisher Scoring iterations: 4
##evaluating model
model1_Red_AIC_train <- model1_Red$aic
##predicted quality for test data based on training data
test_Red_NoType<-subset(test, Type == 0, select=-c(Type))
preds<-predict(model1_Red,newdata=test_Red_NoType, type="response")
model1_Red_error <- table(test_Red_NoType$cat_quality, preds>0.7)
#Curves
evulation_summary_1R <- data.frame(
attempt = 'model1_Red',
AIC = model1_Red_AIC_train,
PRESS = get_press(model1_Red),
'False positive' = round(model1_Red_error[3]/(model1_Red_error[1]+model1_Red_error[3]),3),
'False negative' = round(model1_Red_error[2]/(model1_Red_error[2]+model1_Red_error[4]),3),
'Error Rate' = round((model1_Red_error[2]+model1_Red_error[3])/(model1_Red_error[1]+model1_Red_error[2]+model1_Red_error[3]+model1_Red_error[4]),3)
)
compare_models<-rbind(evulation_summary[1,],evulation_summary_1R)
compare_models
evulation_summary <- rbind(evulation_summary,evulation_summary_1R)
evulation_summary
NA
# model1_Red
library(ROCR)
preds<-predict(model1_Red,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_Red")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_Red_auc <- auc@y.values
regfull_White<-glm(cat_quality~., family="binomial", data=train_White_NoType)
regnull_White<-glm(cat_quality~1,family="binomial", data=train_White_NoType)
step(regnull_White, scope=list(lower=regnull_White, upper=regfull_White), direction="forward")
Start: AIC=4947.36
cat_quality ~ 1
Df Deviance AIC
+ alcohol 1 4280.3 4284.3
+ density 1 4668.2 4672.2
+ volatile.acidity 1 4758.7 4762.7
+ chlorides 1 4805.4 4809.4
+ total.sulfur.dioxide 1 4831.9 4835.9
+ fixed.acidity 1 4912.0 4916.0
+ pH 1 4914.3 4918.3
+ residual.sugar 1 4919.1 4923.1
+ sulphates 1 4934.7 4938.7
<none> 4945.4 4947.4
+ free.sulfur.dioxide 1 4945.2 4949.2
+ citric.acid 1 4945.4 4949.4
Step: AIC=4284.29
cat_quality ~ alcohol
Df Deviance AIC
+ volatile.acidity 1 4030.1 4036.1
+ residual.sugar 1 4214.8 4220.8
+ free.sulfur.dioxide 1 4235.1 4241.1
+ density 1 4253.7 4259.7
+ sulphates 1 4261.3 4267.3
+ fixed.acidity 1 4267.3 4273.3
+ chlorides 1 4273.1 4279.1
+ pH 1 4276.2 4282.2
+ citric.acid 1 4276.3 4282.3
<none> 4280.3 4284.3
+ total.sulfur.dioxide 1 4279.4 4285.4
Step: AIC=4036.1
cat_quality ~ alcohol + volatile.acidity
Df Deviance AIC
+ residual.sugar 1 3942.1 3950.1
+ density 1 3984.6 3992.6
+ free.sulfur.dioxide 1 3999.3 4007.3
+ sulphates 1 4013.2 4021.2
+ fixed.acidity 1 4014.8 4022.8
+ total.sulfur.dioxide 1 4020.2 4028.2
<none> 4030.1 4036.1
+ pH 1 4028.3 4036.3
+ chlorides 1 4028.6 4036.6
+ citric.acid 1 4030.1 4038.1
Step: AIC=3950.11
cat_quality ~ alcohol + volatile.acidity + residual.sugar
Df Deviance AIC
+ fixed.acidity 1 3923.0 3933.0
+ sulphates 1 3924.2 3934.2
+ free.sulfur.dioxide 1 3930.2 3940.2
+ density 1 3932.9 3942.9
+ pH 1 3934.3 3944.3
<none> 3942.1 3950.1
+ total.sulfur.dioxide 1 3941.1 3951.1
+ citric.acid 1 3941.6 3951.6
+ chlorides 1 3942.1 3952.1
Step: AIC=3932.95
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity
Df Deviance AIC
+ sulphates 1 3904.7 3916.7
+ free.sulfur.dioxide 1 3914.0 3926.0
<none> 3923.0 3933.0
+ pH 1 3921.5 3933.5
+ total.sulfur.dioxide 1 3921.7 3933.7
+ density 1 3921.9 3933.9
+ citric.acid 1 3922.7 3934.7
+ chlorides 1 3923.0 3935.0
Step: AIC=3916.74
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity +
sulphates
Df Deviance AIC
+ free.sulfur.dioxide 1 3896.9 3910.9
+ density 1 3898.2 3912.2
<none> 3904.7 3916.7
+ pH 1 3904.6 3918.6
+ total.sulfur.dioxide 1 3904.6 3918.6
+ citric.acid 1 3904.7 3918.7
+ chlorides 1 3904.7 3918.7
Step: AIC=3910.91
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity +
sulphates + free.sulfur.dioxide
Df Deviance AIC
+ density 1 3890.7 3906.7
+ total.sulfur.dioxide 1 3894.7 3910.7
<none> 3896.9 3910.9
+ pH 1 3896.8 3912.8
+ chlorides 1 3896.9 3912.9
+ citric.acid 1 3896.9 3912.9
Step: AIC=3906.7
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity +
sulphates + free.sulfur.dioxide + density
Df Deviance AIC
+ pH 1 3885.2 3903.2
<none> 3890.7 3906.7
+ total.sulfur.dioxide 1 3889.6 3907.6
+ chlorides 1 3890.7 3908.7
+ citric.acid 1 3890.7 3908.7
Step: AIC=3903.24
cat_quality ~ alcohol + volatile.acidity + residual.sugar + fixed.acidity +
sulphates + free.sulfur.dioxide + density + pH
Df Deviance AIC
<none> 3885.2 3903.2
+ total.sulfur.dioxide 1 3884.3 3904.3
+ chlorides 1 3885.0 3905.0
+ citric.acid 1 3885.1 3905.1
Call: glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
fixed.acidity + sulphates + free.sulfur.dioxide + density +
pH, family = "binomial", data = train_White_NoType)
Coefficients:
(Intercept) alcohol volatile.acidity
2.041e+02 8.487e-01 -6.416e+00
residual.sugar fixed.acidity sulphates
1.563e-01 -9.407e-03 1.877e+00
free.sulfur.dioxide density pH
6.778e-03 -2.165e+02 8.920e-01
Degrees of Freedom: 3887 Total (i.e. Null); 3879 Residual
Null Deviance: 4945
Residual Deviance: 3885 AIC: 3903
The model looks good after the foward selection, but the predictor fixed.acidity can be removed. The density VIF is above ten, but jsut barely. For now, it will be left in. Time to test and add to the evaluation summary.
model1_White<-glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
fixed.acidity + sulphates + free.sulfur.dioxide + density +
pH, family = "binomial", data = train_White_NoType)
summary(model1_White)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
fixed.acidity + sulphates + free.sulfur.dioxide + density +
pH, family = "binomial", data = train_White_NoType)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.198 -0.888 0.437 0.798 2.507
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.041e+02 6.993e+01 2.918 0.00352 **
alcohol 8.487e-01 9.686e-02 8.763 < 2e-16 ***
volatile.acidity -6.416e+00 4.477e-01 -14.329 < 2e-16 ***
residual.sugar 1.563e-01 2.746e-02 5.689 1.27e-08 ***
fixed.acidity -9.407e-03 7.582e-02 -0.124 0.90126
sulphates 1.877e+00 4.028e-01 4.661 3.15e-06 ***
free.sulfur.dioxide 6.778e-03 2.540e-03 2.669 0.00761 **
density -2.165e+02 7.088e+01 -3.054 0.00226 **
pH 8.920e-01 3.886e-01 2.295 0.02172 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4945.4 on 3887 degrees of freedom
Residual deviance: 3885.2 on 3879 degrees of freedom
AIC: 3903.2
Number of Fisher Scoring iterations: 5
model1_White<-glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar + sulphates + free.sulfur.dioxide + density +
pH, family = "binomial", data = train_White_NoType)
summary(model1_White)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + residual.sugar +
sulphates + free.sulfur.dioxide + density + pH, family = "binomial",
data = train_White_NoType)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.1956 -0.8867 0.4377 0.7961 2.5070
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.105e+02 4.762e+01 4.420 9.87e-06 ***
alcohol 8.408e-01 7.311e-02 11.501 < 2e-16 ***
volatile.acidity -6.407e+00 4.417e-01 -14.504 < 2e-16 ***
residual.sugar 1.586e-01 1.972e-02 8.045 8.66e-16 ***
sulphates 1.885e+00 3.978e-01 4.739 2.14e-06 ***
free.sulfur.dioxide 6.796e-03 2.536e-03 2.680 0.00737 **
density -2.230e+02 4.773e+01 -4.673 2.97e-06 ***
pH 9.247e-01 2.859e-01 3.235 0.00122 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 4945.4 on 3887 degrees of freedom
Residual deviance: 3885.3 on 3880 degrees of freedom
AIC: 3901.3
Number of Fisher Scoring iterations: 5
##evaluating model
model1_White_AIC_train <- model1_White$aic
##predicted quality for test data based on training data
test_White_NoType<-subset(test, Type == 1, select=-c(Type))
preds<-predict(model1_White,newdata=test_White_NoType, type="response")
model1_White_error <- table(test_White_NoType$cat_quality, preds>0.7)
#Curves
evulation_summary_1W <- data.frame(
attempt = 'model1_White',
AIC = model1_White_AIC_train,
PRESS = get_press(model1_White),
'False positive' = round(model1_White_error[3]/(model1_White_error[1]+model1_White_error[3]),3),
'False negative' = round(model1_White_error[2]/(model1_White_error[2]+model1_White_error[4]),3),
'Error Rate' = round((model1_White_error[2]+model1_White_error[3])/(model1_White_error[1]+model1_White_error[2]+model1_White_error[3]+model1_White_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_1W)
evulation_summary
compare_models<-rbind(compare_models,evulation_summary_1W)
compare_models
NA
regfull_int<-glm(cat_quality~.*Type, family="binomial", data=train)
regnull_int<-glm(cat_quality~1,family="binomial", data=train)
step(regnull_int, scope=list(lower=regnull_int, upper=regfull_int), direction="forward")
Start: AIC=6835.15
cat_quality ~ 1
Df Deviance AIC
+ alcohol 1 5904.3 5908.3
+ density 1 6450.6 6454.6
+ volatile.acidity 1 6468.6 6472.6
+ chlorides 1 6640.0 6644.0
+ Type 1 6755.3 6759.3
+ citric.acid 1 6801.3 6805.3
+ fixed.acidity 1 6807.6 6811.6
+ free.sulfur.dioxide 1 6822.7 6826.7
+ total.sulfur.dioxide 1 6823.5 6827.5
+ sulphates 1 6827.8 6831.8
+ residual.sugar 1 6830.8 6834.8
<none> 6833.2 6835.2
+ pH 1 6831.4 6835.4
Step: AIC=5908.3
cat_quality ~ alcohol
Df Deviance AIC
+ volatile.acidity 1 5539.7 5545.7
+ residual.sugar 1 5781.4 5787.4
+ free.sulfur.dioxide 1 5810.8 5816.8
+ Type 1 5826.6 5832.6
+ chlorides 1 5861.3 5867.3
+ citric.acid 1 5864.8 5870.8
+ total.sulfur.dioxide 1 5872.3 5878.3
+ fixed.acidity 1 5893.0 5899.0
+ pH 1 5895.5 5901.5
+ sulphates 1 5897.6 5903.6
<none> 5904.3 5908.3
+ density 1 5903.5 5909.5
Step: AIC=5545.67
cat_quality ~ alcohol + volatile.acidity
Df Deviance AIC
+ density 1 5476.5 5484.5
+ sulphates 1 5477.1 5485.1
+ residual.sugar 1 5500.5 5508.5
+ Type 1 5504.4 5512.4
+ total.sulfur.dioxide 1 5524.4 5532.4
+ pH 1 5532.9 5540.9
+ free.sulfur.dioxide 1 5533.7 5541.7
<none> 5539.7 5545.7
+ chlorides 1 5537.8 5545.8
+ citric.acid 1 5537.9 5545.9
+ fixed.acidity 1 5538.6 5546.6
Step: AIC=5484.51
cat_quality ~ alcohol + volatile.acidity + density
Df Deviance AIC
+ sulphates 1 5442.1 5452.1
+ citric.acid 1 5463.1 5473.1
+ fixed.acidity 1 5463.8 5473.8
+ total.sulfur.dioxide 1 5464.0 5474.0
+ Type 1 5465.2 5475.2
+ pH 1 5471.0 5481.0
+ free.sulfur.dioxide 1 5471.2 5481.2
+ residual.sugar 1 5472.8 5482.8
<none> 5476.5 5484.5
+ chlorides 1 5476.3 5486.3
Step: AIC=5452.14
cat_quality ~ alcohol + volatile.acidity + density + sulphates
Df Deviance AIC
+ residual.sugar 1 5419.3 5431.3
+ fixed.acidity 1 5422.1 5434.1
+ citric.acid 1 5424.1 5436.1
+ free.sulfur.dioxide 1 5432.7 5444.7
+ total.sulfur.dioxide 1 5435.6 5447.6
+ pH 1 5439.3 5451.3
+ chlorides 1 5439.4 5451.4
<none> 5442.1 5452.1
+ Type 1 5440.5 5452.5
Step: AIC=5431.3
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar
Df Deviance AIC
+ Type 1 5379.2 5393.2
+ total.sulfur.dioxide 1 5385.9 5399.9
+ citric.acid 1 5404.8 5418.8
+ pH 1 5410.7 5424.7
+ fixed.acidity 1 5415.4 5429.4
<none> 5419.3 5431.3
+ free.sulfur.dioxide 1 5417.5 5431.5
+ chlorides 1 5418.8 5432.8
Step: AIC=5393.24
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type
Df Deviance AIC
+ volatile.acidity:Type 1 5348.7 5364.7
+ total.sulfur.dioxide 1 5371.4 5387.4
+ pH 1 5371.9 5387.9
+ citric.acid 1 5372.7 5388.7
+ free.sulfur.dioxide 1 5373.9 5389.9
+ residual.sugar:Type 1 5374.6 5390.6
+ chlorides 1 5376.4 5392.4
+ density:Type 1 5377.1 5393.1
<none> 5379.2 5393.2
+ fixed.acidity 1 5378.4 5394.4
+ alcohol:Type 1 5379.2 5395.2
+ sulphates:Type 1 5379.2 5395.2
Step: AIC=5364.67
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + volatile.acidity:Type
Df Deviance AIC
+ total.sulfur.dioxide 1 5342.0 5360.0
+ residual.sugar:Type 1 5343.3 5361.3
+ free.sulfur.dioxide 1 5344.5 5362.5
+ pH 1 5344.7 5362.7
+ density:Type 1 5345.2 5363.2
+ chlorides 1 5345.8 5363.8
+ citric.acid 1 5346.2 5364.2
<none> 5348.7 5364.7
+ alcohol:Type 1 5348.3 5366.3
+ sulphates:Type 1 5348.5 5366.5
+ fixed.acidity 1 5348.6 5366.6
Step: AIC=5359.98
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + volatile.acidity:Type
Df Deviance AIC
+ total.sulfur.dioxide:Type 1 5303.1 5323.1
+ free.sulfur.dioxide 1 5323.1 5343.1
+ residual.sugar:Type 1 5337.0 5357.0
+ pH 1 5337.3 5357.3
+ chlorides 1 5338.9 5358.9
+ density:Type 1 5339.4 5359.4
+ citric.acid 1 5339.9 5359.9
<none> 5342.0 5360.0
+ alcohol:Type 1 5341.7 5361.7
+ fixed.acidity 1 5341.8 5361.8
+ sulphates:Type 1 5341.9 5361.9
Step: AIC=5323.07
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + volatile.acidity:Type +
Type:total.sulfur.dioxide
Df Deviance AIC
+ free.sulfur.dioxide 1 5286.9 5308.9
+ density:Type 1 5295.6 5317.6
+ pH 1 5299.6 5321.6
+ alcohol:Type 1 5300.2 5322.2
+ chlorides 1 5300.3 5322.3
<none> 5303.1 5323.1
+ citric.acid 1 5302.1 5324.1
+ sulphates:Type 1 5302.2 5324.2
+ residual.sugar:Type 1 5302.4 5324.4
+ fixed.acidity 1 5302.9 5324.9
Step: AIC=5308.91
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
volatile.acidity:Type + Type:total.sulfur.dioxide
Df Deviance AIC
+ density:Type 1 5281.0 5305.0
+ chlorides 1 5283.6 5307.6
+ alcohol:Type 1 5283.8 5307.8
+ pH 1 5284.1 5308.1
<none> 5286.9 5308.9
+ free.sulfur.dioxide:Type 1 5285.6 5309.6
+ citric.acid 1 5285.8 5309.8
+ residual.sugar:Type 1 5286.2 5310.2
+ sulphates:Type 1 5286.3 5310.3
+ fixed.acidity 1 5286.8 5310.8
Step: AIC=5305.04
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
volatile.acidity:Type + Type:total.sulfur.dioxide + density:Type
Df Deviance AIC
+ pH 1 5275.6 5301.6
+ residual.sugar:Type 1 5276.7 5302.7
+ chlorides 1 5277.1 5303.1
<none> 5281.0 5305.0
+ citric.acid 1 5279.1 5305.1
+ free.sulfur.dioxide:Type 1 5279.2 5305.2
+ fixed.acidity 1 5280.0 5306.0
+ alcohol:Type 1 5280.6 5306.6
+ sulphates:Type 1 5280.9 5306.9
Step: AIC=5301.61
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type
Df Deviance AIC
+ residual.sugar:Type 1 5270.3 5298.3
+ pH:Type 1 5271.9 5299.9
+ chlorides 1 5272.9 5300.9
<none> 5275.6 5301.6
+ free.sulfur.dioxide:Type 1 5274.3 5302.3
+ fixed.acidity 1 5275.0 5303.0
+ citric.acid 1 5275.1 5303.1
+ sulphates:Type 1 5275.2 5303.2
+ alcohol:Type 1 5275.4 5303.4
Step: AIC=5298.31
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type
Df Deviance AIC
+ pH:Type 1 5266.7 5296.7
+ chlorides 1 5268.0 5298.0
<none> 5270.3 5298.3
+ free.sulfur.dioxide:Type 1 5268.4 5298.4
+ fixed.acidity 1 5269.7 5299.7
+ citric.acid 1 5269.9 5299.9
+ sulphates:Type 1 5270.1 5300.1
+ alcohol:Type 1 5270.2 5300.2
Step: AIC=5296.67
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type + Type:pH
Df Deviance AIC
+ chlorides 1 5263.4 5295.4
+ free.sulfur.dioxide:Type 1 5263.8 5295.8
<none> 5266.7 5296.7
+ citric.acid 1 5265.9 5297.9
+ alcohol:Type 1 5266.4 5298.4
+ sulphates:Type 1 5266.6 5298.6
+ fixed.acidity 1 5266.6 5298.6
Step: AIC=5295.44
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + chlorides + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type + Type:pH
Df Deviance AIC
+ chlorides:Type 1 5259.3 5293.3
+ free.sulfur.dioxide:Type 1 5260.5 5294.5
<none> 5263.4 5295.4
+ sulphates:Type 1 5262.7 5296.7
+ citric.acid 1 5263.0 5297.0
+ alcohol:Type 1 5263.2 5297.2
+ fixed.acidity 1 5263.4 5297.4
Step: AIC=5293.31
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + chlorides + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type + Type:pH + Type:chlorides
Df Deviance AIC
+ free.sulfur.dioxide:Type 1 5256.2 5292.2
<none> 5259.3 5293.3
+ sulphates:Type 1 5257.9 5293.9
+ citric.acid 1 5258.9 5294.9
+ alcohol:Type 1 5259.3 5295.3
+ fixed.acidity 1 5259.3 5295.3
Step: AIC=5292.16
cat_quality ~ alcohol + volatile.acidity + density + sulphates +
residual.sugar + Type + total.sulfur.dioxide + free.sulfur.dioxide +
pH + chlorides + volatile.acidity:Type + Type:total.sulfur.dioxide +
density:Type + residual.sugar:Type + Type:pH + Type:chlorides +
Type:free.sulfur.dioxide
Df Deviance AIC
<none> 5256.2 5292.2
+ sulphates:Type 1 5254.7 5292.7
+ citric.acid 1 5255.9 5293.9
+ alcohol:Type 1 5256.1 5294.1
+ fixed.acidity 1 5256.2 5294.2
Call: glm(formula = cat_quality ~ alcohol + volatile.acidity + density +
sulphates + residual.sugar + Type + total.sulfur.dioxide +
free.sulfur.dioxide + pH + chlorides + volatile.acidity:Type +
Type:total.sulfur.dioxide + density:Type + residual.sugar:Type +
Type:pH + Type:chlorides + Type:free.sulfur.dioxide, family = "binomial",
data = train)
Coefficients:
(Intercept) alcohol
9.68713 0.84990
volatile.acidity density
-2.84978 -15.58322
sulphates residual.sugar
2.24908 0.03117
Type total.sulfur.dioxide
195.20143 -0.01898
free.sulfur.dioxide pH
0.02478 -0.60200
chlorides volatile.acidity:Type
-4.02470 -3.50882
Type:total.sulfur.dioxide density:Type
0.01756 -201.96697
residual.sugar:Type Type:pH
0.12808 1.50862
Type:chlorides Type:free.sulfur.dioxide
5.10775 -0.01620
Degrees of Freedom: 5196 Total (i.e. Null); 5179 Residual
Null Deviance: 6833
Residual Deviance: 5256 AIC: 5292
The forward step process dropped + sulphates:Type, fixed.acidity, alcohol:Type, and citric.acid By the hierarchical principle, the two non-interactive terms need to be added back because their have interaction terms are in the model.
model1_int<-glm(formula = cat_quality ~ alcohol + volatile.acidity + density +
sulphates + residual.sugar + Type + total.sulfur.dioxide +
free.sulfur.dioxide + pH + chlorides + volatile.acidity:Type +
Type:total.sulfur.dioxide + density:Type + residual.sugar:Type +
Type:pH + Type:chlorides + Type:free.sulfur.dioxide, family = "binomial",
data = train)
summary(model1_int)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + density +
sulphates + residual.sugar + Type + total.sulfur.dioxide +
free.sulfur.dioxide + pH + chlorides + volatile.acidity:Type +
Type:total.sulfur.dioxide + density:Type + residual.sugar:Type +
Type:pH + Type:chlorides + Type:free.sulfur.dioxide, family = "binomial",
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.2378 -0.8774 0.4148 0.8069 2.4950
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.687e+00 4.539e+01 0.213 0.830982
alcohol 8.499e-01 5.665e-02 15.002 < 2e-16 ***
volatile.acidity -2.850e+00 4.243e-01 -6.716 1.87e-11 ***
density -1.558e+01 4.495e+01 -0.347 0.728824
sulphates 2.249e+00 3.032e-01 7.418 1.19e-13 ***
residual.sugar 3.117e-02 5.020e-02 0.621 0.534702
Type 1.952e+02 5.575e+01 3.501 0.000463 ***
total.sulfur.dioxide -1.897e-02 2.917e-03 -6.505 7.77e-11 ***
free.sulfur.dioxide 2.478e-02 8.602e-03 2.880 0.003971 **
pH -6.020e-01 4.860e-01 -1.239 0.215502
chlorides -4.025e+00 1.527e+00 -2.636 0.008391 **
volatile.acidity:Type -3.509e+00 6.200e-01 -5.659 1.52e-08 ***
Type:total.sulfur.dioxide 1.756e-02 3.200e-03 5.487 4.09e-08 ***
density:Type -2.020e+02 5.584e+01 -3.617 0.000299 ***
residual.sugar:Type 1.281e-01 5.214e-02 2.456 0.014036 *
Type:pH 1.509e+00 5.610e-01 2.689 0.007159 **
Type:chlorides 5.108e+00 2.438e+00 2.095 0.036133 *
Type:free.sulfur.dioxide -1.620e-02 9.147e-03 -1.771 0.076642 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5256.2 on 5179 degrees of freedom
AIC: 5292.2
Number of Fisher Scoring iterations: 5
##evaluating model
model1_int_AIC_train <- model1_int$aic
##predicted quality for test data based on training data
preds<-predict(model1_int,newdata=test, type="response")
model1_int_error <- table(test$cat_quality, preds>0.7)
#Curves
evulation_summary_1int <- data.frame(
attempt = 'model1_int',
AIC = model1_int_AIC_train,
PRESS = get_press(model1_int),
'False positive' = round(model1_int_error[3]/(model1_int_error[1]+model1_int_error[3]),3),
'False negative' = round(model1_int_error[2]/(model1_int_error[2]+model1_int_error[4]),3),
'Error Rate' = round((model1_int_error[2]+model1_int_error[3])/(model1_int_error[1]+model1_int_error[2]+model1_int_error[3]+model1_int_error[4]),3)
)
evulation_summary <- rbind(evulation_summary,evulation_summary_1int)
evulation_summary
compare_models<-rbind(compare_models,evulation_summary_1int)
compare_models
NA
compare_models<-compare_models%>%
rename(
Model = attempt
)
library(data.table)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.14.0 using 1 threads (see ?getDTthreads). Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********
Attaching package: ‘data.table’
The following objects are masked from ‘package:reshape2’:
dcast, melt
The following objects are masked from ‘package:dplyr’:
between, first, last
The following object is masked from ‘package:purrr’:
transpose
library(dplyr)
library(formattable)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(tidyr)
customGreen0 = "#DeF7E9"
customGreen = "#71CA97"
customRed = "#ff7f7f"
formattable(compare_models,align =c("l","c", "c", "c", "c", "r"))
NA
Creating the ROC curves and AUC for the 3 new models.
# model1_Red
preds<-predict(model1_Red,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_Red")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_Red_auc <- auc@y.values
# model1_White
preds<-predict(model1_White,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_White")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_White_auc <- auc@y.values
# model1_int
preds<-predict(model1_int,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_int")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_int_auc <- auc@y.values
##create heat map Consolidated
ggplot(data = melted_cor_train, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'Consolidated (Both Red and White)')
##create heat map White
ggplot(data = melted_cor_train_white, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'White Wine')
##create heat map Red
ggplot(data = melted_cor_train_Red, aes(x=Var1, y=Var2, fill=value)) +
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), space = "Lab", name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1))+
coord_fixed()+
labs(title = 'Red Wine')
ggplot(train_with_qual, mapping = aes(x=quality, fill=Type))+
geom_histogram(binwidth=1, alpha=.4, position="identity", color="black")+
geom_vline(aes(xintercept=5.5, color="red"),
linetype="dashed")+
scale_color_manual(name = "Cut Off", values = c("red"))+
labs(x="Quality",
y="Frequency",
title="Distribution of Quality Rating by Wine Type")
This is the table for showing the evaluation for the first model
formattable(evulation_summary[1,])
summary(model1_Red)
summary(model1_White)
summary(model1_int)
Call:
glm(formula = cat_quality ~ alcohol + volatile.acidity + density +
sulphates + residual.sugar + Type + total.sulfur.dioxide +
free.sulfur.dioxide + pH + chlorides + volatile.acidity:Type +
Type:total.sulfur.dioxide + density:Type + residual.sugar:Type +
Type:pH + Type:chlorides + Type:free.sulfur.dioxide, family = "binomial",
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.2378 -0.8774 0.4148 0.8069 2.4950
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 9.687e+00 4.539e+01 0.213 0.830982
alcohol 8.499e-01 5.665e-02 15.002 < 2e-16 ***
volatile.acidity -2.850e+00 4.243e-01 -6.716 1.87e-11 ***
density -1.558e+01 4.495e+01 -0.347 0.728824
sulphates 2.249e+00 3.032e-01 7.418 1.19e-13 ***
residual.sugar 3.117e-02 5.020e-02 0.621 0.534702
Type 1.952e+02 5.575e+01 3.501 0.000463 ***
total.sulfur.dioxide -1.897e-02 2.917e-03 -6.505 7.77e-11 ***
free.sulfur.dioxide 2.478e-02 8.602e-03 2.880 0.003971 **
pH -6.020e-01 4.860e-01 -1.239 0.215502
chlorides -4.025e+00 1.527e+00 -2.636 0.008391 **
volatile.acidity:Type -3.509e+00 6.200e-01 -5.659 1.52e-08 ***
Type:total.sulfur.dioxide 1.756e-02 3.200e-03 5.487 4.09e-08 ***
density:Type -2.020e+02 5.584e+01 -3.617 0.000299 ***
residual.sugar:Type 1.281e-01 5.214e-02 2.456 0.014036 *
Type:pH 1.509e+00 5.610e-01 2.689 0.007159 **
Type:chlorides 5.108e+00 2.438e+00 2.095 0.036133 *
Type:free.sulfur.dioxide -1.620e-02 9.147e-03 -1.771 0.076642 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6833.2 on 5196 degrees of freedom
Residual deviance: 5256.2 on 5179 degrees of freedom
AIC: 5292.2
Number of Fisher Scoring iterations: 5
Table right above the “Best Possible Model (Reduced_4)” section.
formattable(compare_models,align =c("l","c", "c", "c", "c", "r"))
This is the table for showing the best models (top five)
formattable(res.best.logistic$BestModels)
Add ROC for reduced_1, model1_Red, model1_White, model1_int
preds<-predict(reduced_1,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_1")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_1_auc <- auc@y.values
# model1_Red
preds<-predict(model1_Red,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_Red")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_Red_auc <- auc@y.values
# model1_White
preds<-predict(model1_White,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_White")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_White_auc <- auc@y.values
# model1_int
preds<-predict(model1_int,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for model1_int")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
model1_int_auc <- auc@y.values
This is the table for showing the best models (top five)
formattable(evulation_summary[2,])
This is the table for reduced_4 VIF.
formattable(data.frame(reg_4_VIF_test), align =c("l","r"))
This is the next VIF plot in the report
formattable(data.frame(reg_4_2_VIF_test), align =c("l","r"))
The table below that. It is the evaluation summary for reduced_4_2
formattable(evulation_summary[3,])
evaluation summary for the outlier/leverage/etc.
formattable(evulation_summary[4:7,])
add roc curves for these four.
## reduced_4
preds<-predict(reduced_4,newdata=test, type="response")
Error: variable 'Type' was fitted with type "numeric" but type "factor" was supplied
## reduced_4_2
preds<-predict(reduced_4_2,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_2")
lines(x = c(0,1), y = c(0,1), col="red")
auc4_2<-performance(rates, measure = "auc")
reduced_4_2_auc <- auc4_2@y.values
## reduced_4_3
preds<-predict(reduced_4_3,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_3")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_3_auc <- auc@y.values
## reduced_4_4_lev
preds<-predict(reduced_4_4_lev,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_4_lev")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_4_lev_auc <- auc@y.values
## reduced_4_5_DDFFITS
preds<-predict(reduced_4_5_DDFFITS,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_5_DDFFITS")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_5_DDFFITS_auc <- auc@y.values
## reduced_4_6_no_special
preds<-predict(reduced_4_6_no_special,newdata=test, type="response")
rates<-prediction(preds, test$cat_quality)
roc_result<-performance(rates,measure="tpr", x.measure="fpr")
plot(roc_result, main="ROC Curve for reduced_4_6_no_special")
lines(x = c(0,1), y = c(0,1), col="red")
auc<-performance(rates, measure = "auc")
reduced_4_6_no_special_auc <- auc@y.values
AUC_summary <- data.frame('reduced_1'=reduced_1_auc,
'reduced_4'=reduced_4_auc,
'reduced_4_2'=reduced_4_2_auc,
'reduced_4_3'=reduced_4_3_auc,
'reduced_4_4_lev'=reduced_4_4_lev_auc,
'reduced_4_5_DDFFITS'=reduced_4_5_DDFFITS_auc,
'reduced_4_6_no_special'=reduced_4_6_no_special_auc)
colnames(AUC_summary) <- c('reduced_1','reduced_4','reduced_4_2','reduced_4_3','reduced_4_4_lev','reduced_4_5_DDFFITS','reduced_4_6_no_special')
AUC_summary